library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(curl)
library(biomaRt)
library(sqldf)
# Ensembl LD API
library(httr)
library(jsonlite)
library(xml2)
library(gaston)
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR")
# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
# library(finemapr) # devtools::install_github("variani/finemapr")
library(roxygen2) #roxygenize()
# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")
# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] locuscomparer_1.0.0 roxygen2_6.1.1 susieR_0.6.2.0411
## [4] gaston_1.5.4 RcppParallel_4.4.2 Rcpp_1.0.0
## [7] xml2_1.2.0 jsonlite_1.6 httr_1.4.0
## [10] sqldf_0.4-11 RSQLite_2.1.1 gsubfn_0.7
## [13] proto_1.0.0 biomaRt_2.38.0 curl_3.3
## [16] cowplot_0.9.4 plotly_4.8.0 ggplot2_3.1.0
## [19] dplyr_0.7.8 data.table_1.12.0 DT_0.5.1
## [22] readxl_1.2.0
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.42.0 tidyr_0.8.2 bit64_0.9-7
## [4] viridisLite_0.3.0 assertthat_0.2.0 expm_0.999-3
## [7] stats4_3.5.1 blob_1.1.1 cellranger_1.1.0
## [10] yaml_2.2.0 progress_1.2.0 pillar_1.3.1
## [13] lattice_0.20-38 glue_1.3.0 chron_2.3-53
## [16] digest_0.6.18 colorspace_1.4-0 htmltools_0.3.6
## [19] Matrix_1.2-15 plyr_1.8.4 XML_3.98-1.16
## [22] pkgconfig_2.0.2 purrr_0.3.0 scales_1.0.0
## [25] tibble_2.0.1 IRanges_2.16.0 withr_2.1.2
## [28] BiocGenerics_0.28.0 lazyeval_0.2.1 magrittr_1.5
## [31] crayon_1.3.4 memoise_1.1.0 evaluate_0.12
## [34] tools_3.5.1 prettyunits_1.0.2 hms_0.4.2
## [37] stringr_1.3.1 S4Vectors_0.20.1 munsell_0.5.0
## [40] AnnotationDbi_1.44.0 bindrcpp_0.2.2 compiler_3.5.1
## [43] rlang_0.3.1 grid_3.5.1 RCurl_1.95-4.11
## [46] htmlwidgets_1.3 bitops_1.0-6 tcltk_3.5.1
## [49] rmarkdown_1.11 gtable_0.2.0 DBI_1.0.0
## [52] R6_2.3.0 knitr_1.21 bit_1.1-14
## [55] bindr_0.1.1 commonmark_1.7 stringi_1.2.4
## [58] parallel_3.5.1 tidyselect_0.2.5 xfun_0.4
print(paste("susieR ", packageVersion("susieR")))## [1] "susieR 0.6.2.411"
createDT <- function(DF, caption="", scrollY=400){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
print( htmltools::tagList( createDT(DF, caption, scrollY)) )
} Use bioMart to get TSS positions for each gene
get_TSS_position <- function(gene){
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# att <- listAttributes(mart)
# grep("transcription", att$name, value=TRUE)
TSS <- getBM(mart=mart,
attributes=c("hgnc_symbol","transcription_start_site", "version"),
filters="hgnc_symbol", values=gene)
} For each gene, get the position of top SNP (the one with the greatest effect size/Beta)
## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(-1, `P, all studies`) #`Beta, all studies`
top_SNPs <- cbind(Coord=paste(top_SNPs$CHR,top_SNPs$BP, sep=":"), top_SNPs)
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx")
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") Get all genes surrounding the index SNP (default is 1Mb upstream + 1Mb downstream) Nalls et al. (2019) data: https://github.com/neurogenetics/meta5
get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){ #1000000
topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
## Full dataset
filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
# Get col names
# strsplit( readLines(filePath, n = 2), "\t")
minPos <- topSNP_sub$BP - bp_distance
maxPos <- topSNP_sub$BP + bp_distance
geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
sql = paste('select * from file where CHR =',topSNP_sub$CHR,
'AND POS BETWEEN', minPos, 'AND', maxPos))
geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
return(geneSubset)
}
# flankingSNPs <- get_flanking_SNPs("LRRK2", top_SNPs)gaston_LD <- function(gene, flankingSNPs){
# Download portion of vcf from 1KG website
region <- paste(flankingSNPs$CHR[1],":",min(flankingSNPs$POS),"-",max(flankingSNPs$POS), sep="")
chrom <- flankingSNPs$CHR[1]
vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr",chrom,".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",sep="")
system(paste("tabix -fh ",vcf_URL ,region, "> subset.vcf"))
# Import w/ gaston and further subset
bed.file <- read.vcf("subset.vcf")
bed <- select.snps(bed.file, id %in% flankingSNPs$SNP)
rm(bed.file)
# Calculate pairwise LD for all SNP combinations
ld.x <- gaston::LD(bed, lim = c(1,ncol(bed)))
# LD plot
limit <- 20
LD.plot( ld.x[1:limit,1:limit], snp.positions = bed@snps$pos[1:limit])
# Double check subsetting
LD_matrix <- ld.x[row.names(ld.x) %in% flankingSNPs$SNP, colnames(ld.x) %in% flankingSNPs$SNP]
return(LD_matrix)
}
# LD_matrix <- gaston_LD("LRRK2",flankingSNPs) susieR::N3finemapping.FINEMAP()
The I^2 statistic describes the percentage of variation across studies that seems not to be due to chance.
susie_on_gene <- function(gene, top_SNPs){
flankingSNPs <- get_flanking_SNPs(gene, top_SNPs)
### Get LD matrix
LD_matrix <- gaston_LD(gene, flankingSNPs)
## Turn LD matrix into positive semi-definite matrix
# LD_matrix2 <- ifelse(matrixcalc::is.positive.semi.definite(LD_matrix),
# LDmatrix,
# Matrix::nearPD(LD_matrix)$mat %>% as.matrix() )
## Subset summary stats to only include SNPs found in query
geneSubset <- subset(flankingSNPs, SNP %in% row.names(LD_matrix))
b <- geneSubset$Effect
se <- geneSubset$StdErr
# Run Susie
fitted_bhat <- susie_bhat(bhat = b, shat = se,
R = LD_matrix,
n = nrow(LD_matrix),
var_y = var(b),
L = 1, # Must lower to 1 if entering only 3 SNPs
scaled_prior_variance = 0.1,
estimate_residual_variance = TRUE,
estimate_prior_variance = FALSE,
standardize = TRUE)
# Format results
geneSubset$Coord <- paste(geneSubset$CHR, geneSubset$POS, sep="")
susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
base::merge(subset(geneSubset, select=c("SNP","Effect","P.value", "Coord")), by="SNP")
createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200)
return(susieDF)
}
# susieDF <- susie_on_gene("LRRK2", top_SNPs)before_after_plots <- function(susieDF){
## Before fine-mapping
geneSubset <- susieDF %>% arrange(desc(abs(Effect)))
labelSNPs <- geneSubset[1:5,]
before_plot <- ggplot(geneSubset, aes(x=Coord, y=Effect, label=SNP, color= -log(P.value) )) +
geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
geom_point() +
geom_point(data=labelSNPs[1,],
pch=21, fill=NA, size=4, colour="red", stroke=1) +
geom_segment(aes(xend=Coord, yend=0, color= -log(P.value)), alpha=.5) +
geom_text(data=labelSNPs, aes(label=SNP),
vjust="inward",hjust="inward", nudge_y=.05, nudge_x=.25) +
labs(title=paste(gene, "Before Fine Mapping",sep="\n"), y="Beta", x="BP Position",
color="-log(p-value)\nAll Studies") +
theme(plot.title = element_text(hjust = 0.5))
## After fine-mapping
susieDF %>% arrange(desc(PIP))
labelSNPs_PIP <- susieDF[1:5,]
after_plot <- ggplot(susieDF, aes(x=Coord, y=PIP, label=SNP, color= -log(P.value) )) +
geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
geom_point() +
geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),],
pch=21, fill=NA, size=4, colour="green", stroke=1) +
geom_segment(aes(xend=Coord, yend=0, color= -log(P.value))) +
geom_text(data=labelSNPs_PIP, aes(label=SNP),
vjust="inward",hjust="inward",nudge_x = .25) +
labs(title=paste(gene, "After Fine Mapping",sep="\n"), y="PIP", x="BP Position",
color="-log(p-value)\nAll Studies") +
theme(plot.title = element_text(hjust = 0.5))#theme(legend.position="none")
plot_grid(before_plot, after_plot, nrow = 2) %>% print()
# susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T)
}
# before_after_plots(susieDF)geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`)
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)
fineMapped_topSNPs <- data.table()
for (gene in geneList){
cat("\n")
cat("### ", gene, "\n")
susieDF <- susie_on_gene(gene, top_SNPs)
before_after_plots(susieDF)
# Create summary table for all genes
newEntry <- cbind(data.table(Gene=gene), subset(susieDF, PIP==max(PIP)) %>% as.data.table())
fineMapped_topSNPs <- rbind(fineMapped_topSNPs, newEntry)
cat("\n")
}createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html